library(modelr)
library(recipes)
Perform a basic 70:30 split
# Reproducible 70:30 train/test partition of brent_dt.
set.seed(20180409)
splits <- brent_dt %>%
  modelr::resample_partition(c(train = 0.7, test = 0.3))

# Materialise each partition as a tibble for downstream recipes.
train_raw <- splits %>%
  pluck("train") %>%
  as_tibble()

test_raw <- splits %>%
  pluck("test") %>%
  as_tibble()
Start our recipe for processing our data
# Base recipe: declare roles so later steps can select by role.
#   any_purchase    -> outcome
#   email_user_id   -> id (excluded from predictors)
#   everything else -> predictor
starter_recipe <- train_raw %>%
  recipe(.) %>%
  add_role(any_purchase, new_role = "outcome") %>%
  add_role(email_user_id, new_role = "id") %>%
  add_role(everything(), -any_purchase, -email_user_id,
           new_role = "predictor")
starter_recipe
Data Recipe
Inputs:
role #variables
id 1
outcome 1
predictor 188
Perform some steps to remove columns and fill in missing country
# Filtering recipe, estimated (prep'd) on the training data:
#   1. drop the id column,
#   2. remove highly correlated numeric predictors,
#   3. bagged-tree imputation for missing country_code,
#   4. pool infrequent country_code levels into "other",
#   5. drop zero-variance predictors.
filter_recipe <- starter_recipe %>%
  step_rm(has_role("id")) %>%
  step_corr(all_numeric()) %>%
  step_bagimpute(country_code, seed_val = 42) %>%
  step_other(country_code) %>%
  step_zv(all_predictors()) %>%
  prep(train_raw)
filter_recipe
Data Recipe
Inputs:
role #variables
id 1
outcome 1
predictor 188
Training data contained 69362 data points and 13773 incomplete rows.
Operations:
Variables removed email_user_id [trained]
Correlation filter removed h10_clicks, h11_clicks, h6_clicks, h8_clicks, ... [trained]
Bagged tree imputation for country_code [trained]
Collapsing factor levels for country_code [trained]
Zero variance filter removed no terms [trained]
# Apply the trained filtering recipe to both partitions so train and
# test receive identical column removal and imputation.
train_b <- bake(filter_recipe, newdata = train_raw)
test_b <- bake(filter_recipe, newdata = test_raw)
train_b
Perform numeric standardisation steps.
# Standardisation recipe: centre then scale every numeric column,
# with any_purchase declared as the modelling outcome.
standardise_recipe <- train_b %>%
  recipe(any_purchase ~ .) %>%
  step_center(all_numeric()) %>%
  step_scale(all_numeric()) %>%
  prep(train_b)
standardise_recipe
Data Recipe
Inputs:
role #variables
outcome 1
predictor 135
Training data contained 69362 data points and no missing data.
Operations:
Centering for member_rating, holiday_open_prop, ... [trained]
Scaling for member_rating, holiday_open_prop, ... [trained]
# Apply the trained standardisation (means/SDs from train_b) to both
# baked partitions.
train_std <- bake(standardise_recipe, train_b)
test_std <- bake(standardise_recipe, test_b)
train_std
# Upsample the outcome classes on top of the trained filter recipe.
# ratio = .25 — presumably brings the minority class up towards 25%
# of the majority; confirm against the step_upsample documentation.
set.seed(20180101)
filter_recipe %>%
step_upsample(all_outcomes(), ratio= .25) %>%
prep(retain=TRUE) %>%
juice() %>%
# hack because juice isn't reducing the column set
bake(filter_recipe, .) ->
train_up
# Report how many rows the upsampling added relative to train_b.
print(paste("New rows:", nrow(train_up)- nrow(train_b)))
[1] "New rows: 15726"
Now standardise it
train_ups <- bake(standardise_recipe, train_up)
library(synthpop)
# Synthesise extra purchase rows: keep only the purchase records,
# run them through the trained filter recipe, then generate 30000
# synthetic rows from the (small) set of real purchases.
brent_dt %>%
filter(any_purchase=="TRUE") %>%
bake(filter_recipe,.) %>%
syn(k=30000) ->
synth_purchases
Sample(s) of size 30000 will be generated from original data of size 1836.
Variable any_purchase has only one value so its method has been changed to "constant".
Variable any_purchase removed as predictor because only one value.
syn variables
1 member_rating country_code any_purchase X2016.04.Early.Access.Coupon X6.Month.DBA.Training.Plan Blog.Posts DBAreactions.Tuesdays...Fridays First.Responder.Kit.Updates Monday.Links...our.favorite.SQL...tech.news.from.the.week SQL.Server.Updates
Unknown.recd X2017.05.Intersection.Orlando.Pre.Con X2017.05.SQLDay.Poland.Pre.Con X6.Month.DBA.Plan Clients Contact.Form Dell.DBA.Days.2016 EBook.Setup FeedBurner First.Responder.Kit
iTunes.Rating MailOrderDBAs.com Members OptinMonster...BrentOzar.com Sitewide.Footer SQLServerUpdates.com Unknown.Signup.Source..Before.2014.Sept. Unknown.source Watch.Brent.Tune.Queries Webcast.Drawing
Webcast.Registration holiday_open_prop weekday_open_raw weekday_open_prop weekend_open_raw weekend_open_prop h0_opens h1_opens h10_opens h11_opens
h12_opens h13_opens h14_opens h2_opens h3_opens h4_opens h5_opens h6_opens h7_opens h8_opens
h9_opens h0_opensactive h1_opensactive h10_opensactive h11_opensactive h12_opensactive h13_opensactive h14_opensactive h2_opensactive h3_opensactive
h4_opensactive h5_opensactive h6_opensactive h7_opensactive h8_opensactive h9_opensactive h0_opensprop h1_opensprop h10_opensprop h11_opensprop
h12_opensprop h13_opensprop h14_opensprop h2_opensprop h3_opensprop h4_opensprop h5_opensprop h6_opensprop h7_opensprop h8_opensprop
h9_opensprop holiday_clicks_prop weekday_clicks_prop weekend_clicks_raw weekend_clicks_prop h0_clicks h1_clicks h12_clicks h13_clicks h14_clicks
h2_clicks h3_clicks h4_clicks h7_clicks h0_clickactive h1_clickactive h10_clickactive h11_clickactive h12_clickactive h13_clickactive
h14_clickactive h2_clickactive h3_clickactive h4_clickactive h5_clickactive h6_clickactive h7_clickactive h8_clickactive h9_clickactive h0_clickprop
h1_clickprop h10_clickprop h11_clickprop h12_clickprop h13_clickprop h14_clickprop h2_clickprop h3_clickprop h4_clickprop h5_clickprop
h6_clickprop h7_clickprop h8_clickprop h9_clickprop topic_8_B_clicks topic_NA_B_clicks topic_NA_nB_clicks topic_8_B_prop topic_8_nB_prop topic_NA_B_prop
topic_NA_nB_prop topic_8_B_active topic_NA_B_active topic_NA_nB_active never_opened never_clicked
# Combine the synthetic purchases with the baked training set and
# shuffle the rows.
# NOTE(review): dplyr's union() removes duplicate rows, so any
# synthetic row identical to another (or to a train_b row) is
# silently dropped — confirm bind_rows() wasn't intended here.
synth_purchases %>%
pluck("syn") %>%
union(train_b) %>%
sample_n(nrow(.)) ->
train_syn
train_syn_std<- bake(standardise_recipe,train_syn)
We’ll want to build models against our training sets so let’s save some code by making a big list we can map models to.
# Named list of all six training-set variants so models can be
# mapped over them uniformly and labelled by name in plots.
train_sets<-list(
"Basic"=train_b,
"Standardised"=train_std,
"Upsampled"=train_up,
"Upsampled & Standardised"=train_ups,
"Synthesised"=train_syn,
"Synthesised & Standardised" =train_syn_std
)
Let’s remove a high RAM object.
rm("filter_recipe")
library(broom)
library(ggplot2)

# Fit the same small logistic regression on every training variant so
# we can compare how the sampling strategy shifts the coefficients.
# y/x/model = FALSE keeps each fitted object small before saving.
basic_glm <- train_sets %>%
  map(function(df) {
    glm(any_purchase ~ member_rating + First.Responder.Kit +
          Unknown.recd +
          Monday.Links...our.favorite.SQL...tech.news.from.the.week +
          weekday_open_prop + weekday_clicks_prop,
        data = df,
        family = "binomial",
        y = FALSE, x = FALSE, model = FALSE)
  })
saveRDS(basic_glm, "../outputs/glms.rds")
# Tidy the coefficients from every model (labelled by set), drop the
# intercept, and plot estimates against a dashed zero reference line.
glm_coefs <- basic_glm %>%
  map_df(tidy, .id = "set") %>%
  filter(term != "(Intercept)")

ggplot(glm_coefs, aes(x = term, y = estimate, colour = set)) +
  geom_point(alpha = .5, size = 3) +
  coord_flip() +
  ggthemes::theme_fivethirtyeight() +
  geom_hline(aes(yintercept = 0), colour = "darkgrey", linetype = "dashed")
Attributes with values consistently above 0 (dashed line) increase likelihood to buy training, and those below reduce the likelihood. The further away from the 0, the greater the impact.
library(optiRum)
# Show how logistic-regression coefficients translate into odds
# ratios and probabilities for a range of coefficient values.
# data_frame() is deprecated in favour of tibble(); behaviour is
# identical for this call.
vals_to_conv <- seq(-1.5, 1, by = 0.5)
tibble(Coefficient = vals_to_conv,
       `Odds Ratio (p/p-1)` = round(logit.odd(vals_to_conv), 2),
       `Probability` = round(logit.prob(vals_to_conv), 2))
library(FFTrees)
# Fit fast-and-frugal trees on a reduced column range per training
# set; any_purchase is converted to logical as FFTrees expects a
# logical outcome. Optimised for balanced accuracy ("bacc"); the
# comparison algorithms and progress bar are switched off for speed.
train_sets %>%
map(~dplyr::select(mutate(., any_purchase=as.logical(any_purchase)),
member_rating:weekend_open_prop,
holiday_clicks_prop:weekend_clicks_prop)) %>%
map(~FFTrees(any_purchase~., .,
goal="bacc",do.comp = FALSE,progress = FALSE)) ->
basic_trees
saveRDS(basic_trees, "../outputs/trees.rds")
# Combine each model's tree definitions with its training statistics
# into one data frame, labelled by training set.
basic_trees %>%
map_df(~cbind(.$tree.definitions,.$tree.stats$train),.id = "set") ->
basic_tree_results
basic_tree_results
The chart shows different trees built using the different data sets and the balanced accuracy measure. Balanced accuracy is the average of the proportion of purchases correctly classified and the proportion of non-purchases correctly classified.
# Balanced accuracy per tree (cue combination), coloured by the
# training set it was built on.
ggplot(basic_tree_results, aes(x = cues, y = bacc, colour = set)) +
  geom_point(alpha = .5, size = 3) +
  coord_flip() +
  ggthemes::theme_fivethirtyeight()
We can look at the difference between the correctly classified purchases (red) and the correctly classified non-purchases (grey) for the trees with the highest balanced accuracy. There are two visible cases:
Selecting one of these would depend on the cost of the low accuracy for non-purchases.
# Take the ten trees with the highest balanced accuracy and compare
# sensitivity (red point) with specificity (range end) per set.
# str_wrap with width 1 places each cue on its own line so the facet
# labels stay readable.
basic_tree_results %>%
top_n(10, bacc) %>%
mutate(cues=stringr::str_wrap(stringr::str_replace_all(cues,";", " "),1)) %>%
dplyr::select(cues, sens, spec, set) %>%
ggplot(aes(x=set, y=spec, ymax=sens, ymin=spec)) +
geom_pointrange(alpha=0.3, size=2)+
geom_point(aes(y=sens), colour="red", size=6)+
facet_wrap(~cues, ncol=4)+
ggthemes::theme_fivethirtyeight()
glmnets take all our variables and construct a model with them. However, a model with 135 coefficients would be pretty crazy. Instead glmnet downweights coefficients towards 0 as it penalises complexity. As such, even though our model notionally contains all our columns not everything will contribute.
library(glmnet)
library(glmnetUtils)
# Cross-validated penalised logistic regression on every training
# set, selecting lambda by misclassification rate.
# standardize = FALSE so the pre-standardised sets are not scaled a
# second time (the non-standardised sets are left as-is on purpose —
# discussed in the prose below).
train_sets %>%
map(~cv.glmnet(any_purchase~., data=. ,
family = "binomial",type.measure="class",
standardize = FALSE)) ->
basic_glmnets
saveRDS(basic_glmnets, "../outputs/glmnets.rds")
# Extract each model's coefficient vector into one long data frame;
# the single matrix column comes out named "1", renamed to coef.
basic_glmnets %>%
map_df(~rownames_to_column(as.data.frame(as.matrix(coefficients(.))),"col"),.id = "set" ) %>%
rename(coef=`1`) ->
basic_glmnet_coefs
# Count how many models each column enters with a non-zero
# coefficient, then bucket the columns into panes of up to 38 rows
# for faceting.
panes <- basic_glmnet_coefs %>%
  filter(coef != 0, col != "(Intercept)") %>%
  arrange(col) %>%
  count(col) %>%
  mutate(pane = row_number() %/% 38)
# Plot every coefficient per column, facetted into the panes built
# above. inner_join with no `by` performs a natural join on the
# shared column name (col), which also drops columns that never got
# a non-zero coefficient.
basic_glmnet_coefs %>%
inner_join(panes) %>%
arrange(desc(col)) %>%
mutate(col=fct_inorder(col)) %>%
ggplot(aes(x=col, y=coef, colour=set)) +
geom_point() +
coord_flip()+
facet_wrap(~pane,scales = "free", ncol=5)+
ggthemes::theme_fivethirtyeight()
The non-standardised types of samples do not work well with glmnet and typically it would have scaled them. We can see which columns (primarily in the standardised samples) make it into our models most commonly. These will be the most predictive columns overall.
# Frequency chart: how often each column entered a model, ordered by
# count. pane is recomputed here after the frequency sort, so these
# facets group by frequency rank rather than the alphabetical panes
# used in the previous plot.
panes %>%
arrange(desc(n)) %>%
mutate(col=fct_inorder(col)) %>%
mutate( pane = row_number() %/% 38) %>%
arrange(desc(col)) %>%
# Re-level after the reverse sort so coord_flip shows the most
# frequent columns at the top of each facet.
mutate(col=fct_inorder(col)) %>%
ggplot(aes(x=col, y=n)) +
geom_col() +
coord_flip()+
facet_wrap(~pane, ncol=5, scales = "free_y")+
ggthemes::theme_fivethirtyeight()
library(h2o)
# Use data.table for faster frame transfer between R and H2O.
options("h2o.use.data.table"=TRUE)
# Start (or connect to) a local H2O cluster with an 8GB heap.
h2o.init(max_mem_size = "8G")
H2O is not running yet, starting it now...
Note: In case of errors look at the following log files:
C:\Users\steph\AppData\Local\Temp\RtmpCynpf4/h2o_steph_started_from_r.out
C:\Users\steph\AppData\Local\Temp\RtmpCynpf4/h2o_steph_started_from_r.err
java version "1.8.0_161"
Java(TM) SE Runtime Environment (build 1.8.0_161-b12)
Java HotSpot(TM) 64-Bit Server VM (build 25.161-b12, mixed mode)
Starting H2O JVM and connecting: . Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 2 seconds 157 milliseconds
H2O cluster version: 3.16.0.2
H2O cluster version age: 4 months and 12 days !!!
H2O cluster name: H2O_started_from_R_steph_xwr366
H2O cluster total nodes: 1
H2O cluster total memory: 7.11 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 54321
H2O Connection proxy: NA
H2O Internal Security: FALSE
H2O API Extensions: Algos, AutoML, Core V3, Core V4
R Version: R version 3.4.4 (2018-03-15)
# Push each training set into H2O and run AutoML for up to three
# minutes per set, predicting any_purchase.
train_sets %>%
map(as.h2o) %>%
map(~h2o.automl(y = "any_purchase", training_frame = ., seed = 1313, max_runtime_secs = 180)) ->
basic_automl
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=== | 3%
|
|==== | 5%
|
|===== | 5%
|
|===== | 6%
|
|====== | 6%
|
|====== | 7%
|
|======= | 7%
|
|======= | 8%
|
|======== | 8%
|
|======== | 9%
|
|======== | 10%
|
|========= | 10%
|
|========== | 11%
|
|========== | 12%
|
|=========== | 12%
|
|=========== | 13%
|
|============ | 13%
|
|============ | 14%
|
|============= | 14%
|
|============= | 15%
|
|============== | 15%
|
|============== | 16%
|
|=============== | 16%
|
|=============== | 17%
|
|================ | 18%
|
|================= | 19%
|
|================= | 20%
|
|================== | 20%
|
|================== | 21%
|
|=================== | 21%
|
|=================== | 22%
|
|==================== | 22%
|
|==================== | 23%
|
|===================== | 23%
|
|===================== | 24%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=== | 3%
|
|===== | 5%
|
|===== | 6%
|
|====== | 6%
|
|====== | 7%
|
|======= | 7%
|
|======= | 8%
|
|======== | 9%
|
|========= | 10%
|
|========= | 11%
|
|========== | 11%
|
|========== | 12%
|
|=========== | 12%
|
|=========== | 13%
|
|============ | 13%
|
|============ | 14%
|
|============= | 14%
|
|============= | 15%
|
|============== | 15%
|
|============== | 16%
|
|=============== | 16%
|
|=============== | 17%
|
|================ | 18%
|
|================= | 19%
|
|================= | 20%
|
|================== | 20%
|
|================== | 21%
|
|=================== | 21%
|
|=================== | 22%
|
|==================== | 22%
|
|==================== | 23%
|
|===================== | 23%
|
|===================== | 24%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=== | 3%
|
|==== | 5%
|
|===== | 5%
|
|===== | 6%
|
|====== | 7%
|
|======= | 8%
|
|======== | 8%
|
|======== | 9%
|
|========= | 10%
|
|========== | 11%
|
|========== | 12%
|
|=========== | 12%
|
|=========== | 13%
|
|============ | 13%
|
|============ | 14%
|
|============= | 14%
|
|============= | 15%
|
|============== | 15%
|
|============== | 16%
|
|=============== | 16%
|
|=============== | 17%
|
|================ | 18%
|
|================= | 19%
|
|================= | 20%
|
|================== | 20%
|
|================== | 21%
|
|=================== | 21%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=== | 3%
|
|==== | 5%
|
|===== | 5%
|
|===== | 6%
|
|====== | 7%
|
|======= | 8%
|
|======== | 8%
|
|======== | 9%
|
|========= | 10%
|
|========== | 11%
|
|========== | 12%
|
|=========== | 12%
|
|=========== | 13%
|
|============ | 13%
|
|============ | 14%
|
|============= | 14%
|
|============= | 15%
|
|============== | 15%
|
|============== | 16%
|
|=============== | 16%
|
|=============== | 17%
|
|================ | 18%
|
|================= | 19%
|
|================= | 20%
|
|================== | 20%
|
|================== | 21%
|
|=================== | 21%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=== | 3%
|
|==== | 5%
|
|===== | 5%
|
|===== | 6%
|
|====== | 6%
|
|====== | 7%
|
|======= | 7%
|
|======= | 8%
|
|======== | 8%
|
|======== | 9%
|
|======== | 10%
|
|========= | 10%
|
|========= | 11%
|
|========== | 11%
|
|========== | 12%
|
|=========== | 12%
|
|=========== | 13%
|
|============ | 13%
|
|============ | 14%
|
|============= | 14%
|
|============= | 15%
|
|============== | 15%
|
|============== | 16%
|
|=============== | 16%
|
|=============== | 17%
|
|================ | 18%
|
|================= | 19%
|
|================= | 20%
|
|================== | 20%
|
|===================================================================================== | 95%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
|
| | 0%
|
|=== | 3%
|
|===== | 5%
|
|===== | 6%
|
|====== | 6%
|
|====== | 7%
|
|======= | 7%
|
|======= | 8%
|
|======== | 8%
|
|======== | 9%
|
|======== | 10%
|
|========= | 10%
|
|========= | 11%
|
|========== | 11%
|
|========== | 12%
|
|=========== | 12%
|
|=========== | 13%
|
|============ | 13%
|
|============ | 14%
|
|============= | 14%
|
|============= | 15%
|
|============== | 15%
|
|============== | 16%
|
|=============== | 16%
|
|=============== | 17%
|
|================ | 18%
|
|================= | 19%
|
|================= | 20%
|
|================== | 20%
|
|===================================================================================== | 95%
|
|=========================================================================================| 100%
|
| | 0%
|
|=========================================================================================| 100%
saveRDS(basic_automl, "../outputs/automl.rds")